Compare cell type-specific gene expression signatures generated from snRNA-seq measured by SMART-seq vs. 10x 3’v3.

Both datasets measured gene expression of brain cell types from middle temporal gyrus (MTG). The SMART-seq data were published earlier (Hodge et al. 2019). The 10x data were downloaded from cellxgene collection of SEA-AD (Seattle Alzheimer’s Disease Brain Cell Atlas) data.

The SMART-seq were processed by codes saved at our GitHub repo scRNAseq_pipelines. We used different clustering methods to examine whether the clustering results are consistent with cell type annotation, and choose those cells whose cluster membership is consistent with cell type annotation to generate gene expression signature.

The 10x data were processed by code step1_SEA_AD.py in this folder. We simply calculated two cell type-specific gene expression measures for each gene. One is the summation of UMI counts across all the cells of one cell type, and the other is mean value of read-depth normalized UMIs.

Setup

Load R libraries

  • git_dir is the local dir for the Github repo scRNAseq_pipelines.
  • tenx_dir is the local dir where saved the 10x gene expressio data generated by step1_SEA_AD.py.
library(data.table)
library(stringr)
library(org.Hs.eg.db)
library(ICeDT)

library(ggplot2)
library(ggpubr)
library(viridis)
library(ggpointdensity)
library(RColorBrewer)

theme_set(theme_classic())

git_dir  = "~/research/GitHub/scRNAseq_pipelines"
tenx_dir = "~/research/data/SEA-AD"

dim1 <- function(x){if(is.vector(x)) length(x) else dim(x)}
is.uique <- function(x){ length(x) == length(unique(x))}

Read in gene expression signature matrix

Read in SMART-seq data

The data were generated by codes in our Github repo scRNAseq_pipelines. Need to read the expression of all the genes in order to calculate TPM.

rds = readRDS(file.path(git_dir, "_brain_cell_type/step3_DE_output", 
                        "all_genes_MTG.rds"))
lapply(rds, dim1)
## $anno
## [1] 37643    13
## 
## $SIG
## [1] 37643     6
## 
## $cor_between_cell_types
## [1] 1
## 
## $cor_between_cell_type_and_gene_length
## [1] 1
## 
## $mean_log_gene_length
## [1] 1
## 
## $sd_log_gene_length
## [1] 1
sm2_data = rds$SIG
sm2_anno = rds$anno
rm(rds)

rds = readRDS(file.path(git_dir, "_brain_cell_type/step3_DE_output", 
                        "signature_MTG.rds"))
lapply(rds, dim1)
## $anno
## [1] 693  10
## 
## $SIG
## [1] 693   6
sm2_markers = rownames(rds$SIG)
rm(rds)

cts = readRDS(file.path(git_dir, "MTG/ct_matrix_human_MTG.rds"))
dim(cts)
## [1] 37657     7
cts[1:2,] 
##           Inh   Exc Oligo OPC Astro Micro Endo
## 5-HT3C2  1615  2610    78 296    16     3    0
## A1BG    16134 54716  6381   0    55   245    0
cts = data.frame(cts)
cts$gene = rownames(cts)

sm2_df = data.frame(sm2_data)
sm2_df$gene = rownames(sm2_data)
sm2_df = merge(sm2_df, cts, by="gene", suffixes=c("", "_count"))
dim(sm2_df)
## [1] 37643    14
sm2_df[1:2,]
##      gene      Astro       Exc       Inh     Micro      Oligo      OPC
## 1 5-HT3C2 0.05574913 0.2499282 0.3910412 0.0483871  0.2516129 1.264957
## 2    A1BG 0.19163763 5.2354799 3.9065375 3.9516129 20.5838710 0.000000
##   Inh_count Exc_count Oligo_count OPC_count Astro_count Micro_count Endo
## 1      1615      2610          78       296          16           3    0
## 2     16134     54716        6381         0          55         245    0
cor(log10(sm2_df$Exc_count), log10(sm2_df$Exc))
## [1] 0.9998821
smrt_df = cts[,c(8,1:7)]
dim(smrt_df)
## [1] 37657     8
smrt_df[1:2,] 
##            gene   Inh   Exc Oligo OPC Astro Micro Endo
## 5-HT3C2 5-HT3C2  1615  2610    78 296    16     3    0
## A1BG       A1BG 16134 54716  6381   0    55   245    0

Read in 10x data

files = list.files(tenx_dir, pattern="\\.csv")
length(files)
## [1] 12
inh_cts = c("Lamp5", "Pvalb", "Sst", "Vip")
exc_cts = c("L2_3_IT", "L4_IT", "L5_IT", "L6_IT")

cell_types = gsub("_genes_info.csv", "", files, fixed = TRUE)
cell_types
##  [1] "Astro"     "L2_3_IT"   "L4_IT"     "L5_IT"     "L6_IT"     "Lamp5"    
##  [7] "Micro-PVM" "Oligo"     "OPC"       "Pvalb"     "Sst"       "Vip"
dat_10x = list()

for(i in 1:length(files)){
  f1 = files[i]
  ct = cell_types[i]
  if(ct %in% inh_cts){ ct = paste0("Inh_", ct) }
  if(ct %in% exc_cts){ ct = paste0("Exc_", ct) }
  
  dat_10x[[ct]] = fread(file.path(tenx_dir, f1))
  
  if(i == 1){
    tenx_df = dat_10x[[ct]][,c("gene_ids", "feature_name")]
  }else{
    stopifnot(all(tenx_df$gene_ids == dat_10x[[ct]]$gene_ids))
  }
  tenx_df[[ct]] = dat_10x[[ct]]$total_counts
}

dim(tenx_df)
## [1] 36517    14
tenx_df[1:2,]
##           gene_ids feature_name Astro Exc_L2_3_IT Exc_L4_IT Exc_L5_IT Exc_L6_IT
## 1: ENSG00000000003       TSPAN6  8310        6458      3049      3142      1146
## 2: ENSG00000000005         TNMD   174        1104       496       496        70
##    Inh_Lamp5 Micro-PVM Oligo  OPC Inh_Pvalb Inh_Sst Inh_Vip
## 1:       343        94   373 2574      2222     463    3149
## 2:        23         1     8   89       225      65     210
L_23 = dat_10x$Exc_L2_3_IT
dim(L_23)
## [1] 36517    11
L_23[1:2,]
##           gene_ids feature_is_filtered feature_name feature_reference
## 1: ENSG00000000003               FALSE       TSPAN6    NCBITaxon:9606
## 2: ENSG00000000005               FALSE         TNMD    NCBITaxon:9606
##    feature_biotype n_cells_by_counts mean_counts pct_dropout_by_counts
## 1:            gene              6086 0.019564658              98.15623
## 2:            gene              1024 0.003344593              99.68978
##    total_counts mean_rd_normalized_count var_rd_normalized_count
## 1:         6458              0.006393405            0.0034943754
## 2:         1104              0.001037519            0.0005624617
L_23 = L_23[which(L_23$pct_dropout_by_counts <= 98),]
dim(L_23)
## [1] 19530    11
ggplot(L_23, aes(x=log10(total_counts), 
                 y=log10(mean_rd_normalized_count))) +
  geom_point(alpha=0.5, color="orange")

ggplot(L_23, aes(x=log10(mean_rd_normalized_count), 
                 y=log10(var_rd_normalized_count))) +
  geom_point(alpha=0.5, color="orange")

Read in gene annoation

gtf_fn = file.path(git_dir, "MTG/GTF2/gene_info_v26.rds")
gtf = readRDS(gtf_fn)
gtf[1:2,]
##    Chr     V2   V3 Start   End V6 V7 V8              ENSG    GENE
## 1 chr1 HAVANA gene 11869 14403  .  +  . ENSG00000223972.5 DDX11L1
## 7 chr1 HAVANA gene 14410 29553  .  -  . ENSG00000227232.5  WASH7P
exon_fn = file.path(git_dir, 
                    "MTG/GTF2/exon_by_genes_gencode_v26.rds")
exon = readRDS(exon_fn)

dim(gtf)
## [1] 56292    10
length(exon)
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## [1] 56292
stopifnot(setequal(names(exon), gtf$ENSG))

gtf = gtf[match(names(exon), gtf$ENSG),]
gene_len = sum(BiocGenerics::width(GenomicRanges::reduce(exon)))
gtf$gene_length = as.numeric(gene_len)
rm(exon)

gtf = gtf[which(gtf$Chr %in% paste0("chr",1:22)),]
gtf$ChrNum = as.integer(gsub("chr","",gtf$Chr))
gtf = gtf[order(gtf$ChrNum, gtf$Start),]

gtf$ENSG_version = gtf$ENSG
gtf$ENSG = gsub("\\..*","", gtf$ENSG_version)

dim(gtf)
## [1] 53288    13
gtf[1:2,]
##    Chr     V2   V3 Start   End V6 V7 V8            ENSG    GENE gene_length
## 1 chr1 HAVANA gene 11869 14403  .  +  . ENSG00000223972 DDX11L1        1729
## 7 chr1 HAVANA gene 14410 29553  .  -  . ENSG00000227232  WASH7P        1328
##   ChrNum      ENSG_version
## 1      1 ENSG00000223972.5
## 7      1 ENSG00000227232.5

Check the relation between gene length and expression

SMART-seq data

gene_anno = as.data.frame(sm2_anno[,c("gene", "chromosome", 
                                      "entrez_id")])
dim(gene_anno)
## [1] 37643     3
gene_anno[1:2,]
##            gene chromosome entrez_id
## 5-HT3C2 5-HT3C2          3    389180
## A1BG       A1BG         19         1
length(unique(gene_anno$gene))
## [1] 37643
map1 = mapIds(org.Hs.eg.db, keys=as.character(gene_anno$entrez_id), 
              column='ENSEMBL', keytype='ENTREZID')
## 'select()' returned 1:many mapping between keys and columns
map1[1:5]
##            389180                 1            503538             29974 
##                NA "ENSG00000121410" "ENSG00000268895" "ENSG00000148584" 
##                 2 
## "ENSG00000175899"
stopifnot(names(map1) == gene_anno$entrez_id)
gene_anno$ENSG = as.character(map1)
table(is.na(gene_anno$ENSG))
## 
## FALSE  TRUE 
## 24529 13114
gene_anno = gene_anno[which(!is.na(gene_anno$ENSG)),]
t1 = table(gene_anno$ENSG)
table(t1)
## t1
##     1     2     3 
## 24395    64     2
t1[t1 > 2]
## 
## ENSG00000186645 ENSG00000223972 
##               3               3
gene_anno[gene_anno$ENSG == names(t1[t1 > 2])[1],]
##              gene chromosome entrez_id            ENSG
## SPDYE10P SPDYE10P          7    643862 ENSG00000186645
## SPDYE12P SPDYE12P          7 100101268 ENSG00000186645
## SPDYE17   SPDYE17          7 102723849 ENSG00000186645
non_uniq_ENSG = names(t1)[t1 > 1]

gene_anno = gene_anno[-which(gene_anno$ENSG %in% non_uniq_ENSG),]
dim(gene_anno)
## [1] 24395     4
gene_anno[1:2,]
##              gene chromosome entrez_id            ENSG
## A1BG         A1BG         19         1 ENSG00000121410
## A1BG-AS1 A1BG-AS1         19    503538 ENSG00000268895
table(gene_anno$ENSG %in% gtf$ENSG)
## 
## FALSE  TRUE 
##  1371 23024
gene_anno = merge(gene_anno, gtf, by="ENSG")
gene_anno = gene_anno[,c("ENSG", "gene", "Chr", "Start", "End", "gene_length")]
dim(gene_anno)
## [1] 23024     6
gene_anno[1:2,]
##              ENSG  gene   Chr     Start       End gene_length
## 1 ENSG00000000419  DPM1 chr20  50934867  50958555        1207
## 2 ENSG00000000457 SCYL3  chr1 169849631 169894267        5586
smrt_df = merge(smrt_df, gene_anno, by="gene")
dim(smrt_df)
## [1] 23024    13
smrt_df[1:2,]
##       gene   Inh   Exc Oligo OPC Astro Micro Endo            ENSG   Chr
## 1     A1BG 16134 54716  6381   0    55   245    0 ENSG00000121410 chr19
## 2 A1BG-AS1  2760  8445   359   1     1     1    0 ENSG00000268895 chr19
##      Start      End gene_length
## 1 58345178 58353499        2629
## 2 58347751 58354356        1411
cts_smrt = c("Astro", "Exc", "Inh", "Micro", "Oligo", "OPC")
cr1 = cor(smrt_df$gene_length, smrt_df[,cts_smrt], method="spearman")
cr2 = cor(smrt_df$gene_length, smrt_df[,cts_smrt]/smrt_df$gene_length, 
          method="spearman")

glist = list()

for(ct_i in cts_smrt){
  glist[[ct_i]] = ggplot(smrt_df, 
                    aes_string(x="log10(gene_length)", 
                               y=sprintf("log10(%s)", ct_i))) + 
    geom_pointdensity() + scale_color_viridis() + 
    ggtitle(sprintf("%s, Sp. corr: %.2f/%.2f", ct_i, 
                    cr1[,ct_i], cr2[,ct_i])) + 
    theme(legend.position = "none")
}

ggarrange(plotlist=glist, nrow = 2, ncol = 3)

10x data

dim(tenx_df)
## [1] 36517    14
cols2kp = c("ENSG", "Chr", "Start", "End", "gene_length")
tenx_df = merge(tenx_df, gtf[,cols2kp], by.x="gene_ids", by.y="ENSG")
names(tenx_df)[1:2] = c("ENSG", "gene")
names(tenx_df)[which(names(tenx_df) == "Micro-PVM")] = "Micro"

dim(tenx_df)
## [1] 32320    18
tenx_df[1:2,]
##               ENSG  gene Astro Exc_L2_3_IT Exc_L4_IT Exc_L5_IT Exc_L6_IT
## 1: ENSG00000000419  DPM1 32612      550753    205876    213916    110076
## 2: ENSG00000000457 SCYL3 19382      241470     81828    113964     45380
##    Inh_Lamp5 Micro Oligo   OPC Inh_Pvalb Inh_Sst Inh_Vip   Chr     Start
## 1:     27030  9424 31439 11798     90418   32660   67887 chr20  50934867
## 2:     15370  7634 16463 10702     40361   16518   32396  chr1 169849631
##          End gene_length
## 1:  50958555        1207
## 2: 169894267        5586
cts_10x = names(tenx_df)[3:14]
cts_10x
##  [1] "Astro"       "Exc_L2_3_IT" "Exc_L4_IT"   "Exc_L5_IT"   "Exc_L6_IT"  
##  [6] "Inh_Lamp5"   "Micro"       "Oligo"       "OPC"         "Inh_Pvalb"  
## [11] "Inh_Sst"     "Inh_Vip"
glist = list()

for(i in 1:length(cts_10x)){
  ct1 = cts_10x[i]
  cr1 = cor(tenx_df$gene_length, tenx_df[[ct1]], method="spearman")
  cr2 = cor(tenx_df$gene_length, tenx_df[[ct1]]/tenx_df$gene_length, 
            method="spearman")

  glist[[i]] = ggplot(tenx_df, aes_string(x="log10(gene_length)", 
                 y=sprintf("log10(%s)", ct1))) + 
    geom_pointdensity() + scale_color_viridis() + 
    ggtitle(sprintf("%s, Sp. corr: %.2f/%.2f", ct_i, cr1, cr2)) + 
    theme(legend.position = "none")
}

ggarrange(plotlist=glist, nrow = 4, ncol = 3)

Check correlation between two datasests

Using refined excitatory and inhibitory neuron types

theme1 = theme(axis.text.x = element_text(angle = 60, vjust = 1, 
                                   size = 11, hjust = 1),
               axis.text.y = element_text(size = 11))
stopifnot(is.uique(smrt_df$ENSG))
stopifnot(is.uique(tenx_df$ENSG))

dat_combined = merge(smrt_df, tenx_df, by="ENSG", 
                     suffixes = c("","_10x"))
dim(dat_combined)
## [1] 19179    30
dat_combined[1:2,]
##              ENSG  gene    Inh    Exc Oligo   OPC Astro Micro Endo   Chr
## 1 ENSG00000000419  DPM1 301355 683217 16911 13857 11489  1944  925 chr20
## 2 ENSG00000000457 SCYL3  55202 145707  3320  3051  6389  1091    0  chr1
##       Start       End gene_length gene_10x Astro_10x Exc_L2_3_IT Exc_L4_IT
## 1  50934867  50958555        1207     DPM1     32612      550753    205876
## 2 169849631 169894267        5586    SCYL3     19382      241470     81828
##   Exc_L5_IT Exc_L6_IT Inh_Lamp5 Micro_10x Oligo_10x OPC_10x Inh_Pvalb Inh_Sst
## 1    213916    110076     27030      9424     31439   11798     90418   32660
## 2    113964     45380     15370      7634     16463   10702     40361   16518
##   Inh_Vip Chr_10x Start_10x   End_10x gene_length_10x
## 1   67887   chr20  50934867  50958555            1207
## 2   32396    chr1 169849631 169894267            5586
table(dat_combined$gene == dat_combined$gene_10x)
## 
## FALSE  TRUE 
##  2256 16923
cts_10x_adj = cts_10x
ww1 = which(cts_10x %in% cts_smrt)
cts_10x_adj[ww1] = paste0(cts_10x[ww1], "_10x")

cr_mat = cor(dat_combined[,cts_smrt], dat_combined[,cts_10x_adj], 
             method="spearman")
round(cr_mat, 2)
##       Astro_10x Exc_L2_3_IT Exc_L4_IT Exc_L5_IT Exc_L6_IT Inh_Lamp5 Micro_10x
## Astro      0.84        0.64      0.65      0.65      0.65      0.68      0.67
## Exc        0.72        0.86      0.86      0.86      0.86      0.82      0.69
## Inh        0.73        0.80      0.81      0.81      0.81      0.85      0.70
## Micro      0.57        0.53      0.53      0.53      0.53      0.55      0.76
## Oligo      0.69        0.66      0.67      0.66      0.66      0.69      0.68
## OPC        0.75        0.70      0.71      0.71      0.71      0.74      0.69
##       Oligo_10x OPC_10x Inh_Pvalb Inh_Sst Inh_Vip
## Astro      0.73    0.74      0.67    0.67    0.67
## Exc        0.75    0.76      0.82    0.83    0.81
## Inh        0.76    0.78      0.85    0.86    0.85
## Micro      0.61    0.57      0.54    0.55    0.54
## Oligo      0.85    0.72      0.68    0.68    0.68
## OPC        0.76    0.85      0.73    0.74    0.74
length(sm2_markers)
## [1] 693
sm2_markers[1:4]
## [1] "ABAT"  "ABCA2" "ABCA8" "ABCC4"
table(sm2_markers %in% dat_combined$gene)
## 
## FALSE  TRUE 
##    23   670
dat_markers = dat_combined[which(dat_combined$gene %in% sm2_markers),]
dim(dat_markers)
## [1] 670  30
dat_markers[1:2,]
##               ENSG  gene    Inh     Exc Oligo   OPC Astro Micro Endo  Chr
## 23 ENSG00000002746 HECW1  58969  575538   314  6366  1604     0    0 chr7
## 30 ENSG00000003147  ICA1 898864 2591450   334 36952   164  2096  495 chr7
##       Start      End gene_length gene_10x Astro_10x Exc_L2_3_IT Exc_L4_IT
## 23 43112599 43566001       13535    HECW1     20254     7998995   4085283
## 30  8113184  8262579        3968     ICA1      5292     2037354    662684
##    Exc_L5_IT Exc_L6_IT Inh_Lamp5 Micro_10x Oligo_10x OPC_10x Inh_Pvalb Inh_Sst
## 23   4809762   1606772    106715     10481     24589  168726    949779  129909
## 30    796024    379456    131221     14058      7838   30508    406668  155240
##    Inh_Vip Chr_10x Start_10x  End_10x gene_length_10x
## 23  435460    chr7  43112599 43566001           13535
## 30  175158    chr7   8113184  8262579            3968
cr_mat = cor(dat_markers[,cts_smrt], dat_markers[,cts_10x_adj], 
             method="spearman")
colnames(cr_mat) = gsub("_10x", "", colnames(cr_mat))

cr_mat = cr_mat[,order(colnames(cr_mat))]
round(cr_mat, 2)
##       Astro Exc_L2_3_IT Exc_L4_IT Exc_L5_IT Exc_L6_IT Inh_Lamp5 Inh_Pvalb
## Astro  0.85        0.27      0.25      0.24      0.26      0.25      0.26
## Exc    0.38        0.88      0.89      0.89      0.88      0.67      0.68
## Inh    0.36        0.64      0.66      0.65      0.63      0.85      0.81
## Micro -0.09        0.01      0.01      0.01      0.01     -0.05     -0.03
## Oligo  0.27        0.37      0.38      0.36      0.38      0.36      0.36
## OPC    0.42        0.46      0.46      0.44      0.44      0.53      0.51
##       Inh_Sst Inh_Vip Micro Oligo   OPC
## Astro    0.22    0.27  0.04  0.24  0.35
## Exc      0.70    0.69  0.23  0.47  0.51
## Inh      0.86    0.87  0.14  0.45  0.57
## Micro   -0.06   -0.07  0.75  0.05 -0.12
## Oligo    0.38    0.37  0.08  0.86  0.37
## OPC      0.50    0.54  0.08  0.45  0.87
melted_cr_mat = reshape2::melt(cr_mat, value.name = "corr", 
                               varnames=c("cell type SMART-seq",
                                          "cell type 10x"))
dim(melted_cr_mat)
## [1] 72  3
melted_cr_mat[1:2,]
##   cell type SMART-seq cell type 10x      corr
## 1               Astro         Astro 0.8540557
## 2                 Exc         Astro 0.3792698
ggplot(data = melted_cr_mat, 
       aes(x=`cell type 10x`, y=`cell type SMART-seq`, fill=corr)) + 
  geom_tile() + theme_classic() + 
  scale_fill_gradient(low = "white", high = "red") + theme1

Using collapsed excitatory and inhibitory neuron types

exc_cts = c("L2_3_IT", "L4_IT", "L5_IT", "L6_IT")
inh_cts = c("Lamp5", "Pvalb", "Sst", "Vip")

dat_combined[["Exc_10x"]] = rowSums(dat_combined[,paste0("Exc_", exc_cts)])
dat_combined[["Inh_10x"]] = rowSums(dat_combined[,paste0("Inh_", inh_cts)])

cts_10x_combined = paste0(cts_smrt, "_10x")

dat_markers = dat_combined[which(dat_combined$gene %in% sm2_markers),]
dim(dat_markers)
## [1] 670  32
dat_markers[1:2,]
##               ENSG  gene    Inh     Exc Oligo   OPC Astro Micro Endo  Chr
## 23 ENSG00000002746 HECW1  58969  575538   314  6366  1604     0    0 chr7
## 30 ENSG00000003147  ICA1 898864 2591450   334 36952   164  2096  495 chr7
##       Start      End gene_length gene_10x Astro_10x Exc_L2_3_IT Exc_L4_IT
## 23 43112599 43566001       13535    HECW1     20254     7998995   4085283
## 30  8113184  8262579        3968     ICA1      5292     2037354    662684
##    Exc_L5_IT Exc_L6_IT Inh_Lamp5 Micro_10x Oligo_10x OPC_10x Inh_Pvalb Inh_Sst
## 23   4809762   1606772    106715     10481     24589  168726    949779  129909
## 30    796024    379456    131221     14058      7838   30508    406668  155240
##    Inh_Vip Chr_10x Start_10x  End_10x gene_length_10x  Exc_10x Inh_10x
## 23  435460    chr7  43112599 43566001           13535 18500812 1621863
## 30  175158    chr7   8113184  8262579            3968  3875518  868287
cr_mat = cor(dat_markers[,cts_smrt], dat_markers[,cts_10x_combined], 
             method="spearman")
colnames(cr_mat) = gsub("_10x", "", colnames(cr_mat))

cr_mat = cr_mat[,order(colnames(cr_mat))]
round(cr_mat, 2)
##       Astro  Exc   Inh Micro Oligo   OPC
## Astro  0.85 0.25  0.25  0.04  0.24  0.35
## Exc    0.38 0.90  0.68  0.23  0.47  0.51
## Inh    0.36 0.65  0.89  0.14  0.45  0.57
## Micro -0.09 0.00 -0.08  0.75  0.05 -0.12
## Oligo  0.27 0.36  0.35  0.08  0.86  0.37
## OPC    0.42 0.46  0.53  0.08  0.45  0.87
melted_cr_mat = reshape2::melt(cr_mat, value.name = "corr", 
                               varnames=c("cell type SMART-seq",
                                          "cell type 10x"))
dim(melted_cr_mat)
## [1] 36  3
melted_cr_mat[1:2,]
##   cell type SMART-seq cell type 10x      corr
## 1               Astro         Astro 0.8540557
## 2                 Exc         Astro 0.3792698
ggplot(data = melted_cr_mat, 
       aes(x=`cell type 10x`, y=`cell type SMART-seq`, fill=corr)) + 
  geom_tile() + theme_classic() + 
  scale_fill_gradient(low = "white", high = "red") + theme1

Check a few scatter plot

for(ct1 in cts_smrt){
  g1 = ggplot(dat_combined, aes_string(x=sprintf("log10(%s)", ct1), 
                   y=sprintf("log10(%s_10x)", ct1))) +
    geom_point(alpha=0.5, color="orange") + 
    geom_abline(intercept = 0, slope=1, col="red")
  print(g1)
}

Deconvolution

Read in bulk RNA-seq data

bulk = readRDS("../../CSeQTL/s7_gtexBrain/counts/trec_hap1_hap2_QCres.rds")
sapply(bulk, dim1)
## $trec
## [1] 56200   174
## 
## $hap1
## [1] 56200   174
## 
## $hap2
## [1] 56200   174
## 
## $filt
## [1] 3
## 
## $rm_subj
## [1] 1
bulk = bulk$trec

dim(bulk)
## [1] 56200   174
bulk[1:2,1:3]
##                    GTEX-1192X GTEX-11DYG GTEX-11DZ1
## ENSG00000000003.14        337        365        232
## ENSG00000000005.5           9          6          7
rownames(bulk) = gsub("\\..*", "", rownames(bulk))

dim(dat_combined)
## [1] 19179    32
table(dat_combined$ENSG %in% rownames(bulk))
## 
##  TRUE 
## 19179
bulk = bulk[match(dat_combined$ENSG, rownames(bulk)),]
dim(bulk)
## [1] 19179   174

Calculate TPM using all available genes

num_genes_tpm = nrow(bulk); num_genes_tpm
## [1] 19179
get_tpm <- function(dat_bulk, gene_length){
  dat_tpm   = apply(dat_bulk,2,function(xx) xx / gene_length)
  dat_tpm   = 1e6 * apply(dat_tpm,2,function(xx) xx / sum(xx))
  dat_tpm
}

bulk_tpm    = get_tpm(bulk, dat_combined$gene_length)

dim(bulk)
## [1] 19179   174
dim(bulk_tpm)
## [1] 19179   174
bulk[1:2,1:3]
##                 GTEX-1192X GTEX-11DYG GTEX-11DZ1
## ENSG00000000419        879        719        421
## ENSG00000000457        342        366        151
bulk_tpm[1:2,1:3]
##                 GTEX-1192X GTEX-11DYG GTEX-11DZ1
## ENSG00000000419  99.651727  73.235920  89.624590
## ENSG00000000457   8.377769   8.055318   6.945898
dim(dat_combined)
## [1] 19179    32
dat_combined[1:2,]
##              ENSG  gene    Inh    Exc Oligo   OPC Astro Micro Endo   Chr
## 1 ENSG00000000419  DPM1 301355 683217 16911 13857 11489  1944  925 chr20
## 2 ENSG00000000457 SCYL3  55202 145707  3320  3051  6389  1091    0  chr1
##       Start       End gene_length gene_10x Astro_10x Exc_L2_3_IT Exc_L4_IT
## 1  50934867  50958555        1207     DPM1     32612      550753    205876
## 2 169849631 169894267        5586    SCYL3     19382      241470     81828
##   Exc_L5_IT Exc_L6_IT Inh_Lamp5 Micro_10x Oligo_10x OPC_10x Inh_Pvalb Inh_Sst
## 1    213916    110076     27030      9424     31439   11798     90418   32660
## 2    113964     45380     15370      7634     16463   10702     40361   16518
##   Inh_Vip Chr_10x Start_10x   End_10x gene_length_10x Exc_10x Inh_10x
## 1   67887   chr20  50934867  50958555            1207 1080621  217995
## 2   32396    chr1 169849631 169894267            5586  482642  104645
cell_types = c("Astro", "Exc", "Inh", "Micro", "Oligo", "OPC")
cell_types_10x = paste0(cell_types, "_10x")
  
sig_sm2 = dat_combined[,cell_types]
sig_10x = dat_combined[,cell_types_10x]

dim(sig_sm2)
## [1] 19179     6
sig_sm2[1:2,]
##   Astro    Exc    Inh Micro Oligo   OPC
## 1 11489 683217 301355  1944 16911 13857
## 2  6389 145707  55202  1091  3320  3051
dim(sig_10x)
## [1] 19179     6
sig_10x[1:2,]
##   Astro_10x Exc_10x Inh_10x Micro_10x Oligo_10x OPC_10x
## 1     32612 1080621  217995      9424     31439   11798
## 2     19382  482642  104645      7634     16463   10702
rownames(sig_sm2) = dat_combined$ENSG
rownames(sig_10x) = dat_combined$ENSG

length(sm2_markers)
## [1] 693
sm2_markers[1:5]
## [1] "ABAT"  "ABCA2" "ABCA8" "ABCC4" "ACACB"
table(sm2_markers %in% dat_combined$gene)
## 
## FALSE  TRUE 
##    23   670
markers = dat_combined$ENSG[dat_combined$gene %in% sm2_markers]
length(markers)
## [1] 670
stopifnot(all(markers %in% rownames(sig_sm2)))
stopifnot(all(markers %in% rownames(bulk_tpm)))

sig_sm2_tpm = get_tpm(sig_sm2, dat_combined$gene_length)
sig_10x_tpm = get_tpm(sig_10x, dat_combined$gene_length)

sig_sm2_tpm = sig_sm2_tpm[match(markers, rownames(sig_sm2_tpm)),]
sig_10x_tpm = sig_10x_tpm[match(markers, rownames(sig_10x_tpm)),]

bulk_tpm    = bulk_tpm[match(markers, rownames(bulk_tpm)),]

dim(bulk_tpm)
## [1] 670 174
dim(sig_sm2_tpm)
## [1] 670   6
res_sm2 = ICeDT(bulk_tpm, sig_sm2_tpm, tumorPurity=rep(0,ncol(bulk_tpm)))
## Adding 1e-5 to Y to ensure valid log transformation.
## Adding 1e-5 to Z to ensure valid log transformation.
## Iter 1: max diff of rho: 0.341202055892905
## .........
res_10x = ICeDT(bulk_tpm, sig_10x_tpm, tumorPurity=rep(0,ncol(bulk_tpm)))
## Adding 1e-5 to Y to ensure valid log transformation.
## Iter 1: max diff of rho: 0.635354680664219
## ........
rho_sm2 = t(res_sm2$rho)
rho_10x = t(res_10x$rho)

dim(rho_sm2)
## [1] 174   7
rho_sm2[1:2,]
##            tumor     Astro       Exc       Inh       Micro      Oligo
## GTEX-1192X     0 0.2004574 0.6367593 0.0732894 0.016181474 0.03112912
## GTEX-11DYG     0 0.1021826 0.5752583 0.1400363 0.005418761 0.10679713
##                   OPC
## GTEX-1192X 0.04218334
## GTEX-11DYG 0.07030687
dim(rho_10x)
## [1] 174   7
rho_10x[1:2,]
##            tumor Astro_10x   Exc_10x    Inh_10x  Micro_10x  Oligo_10x
## GTEX-1192X     0 0.2541224 0.4935282 0.09500614 0.04143025 0.05945821
## GTEX-11DYG     0 0.1440168 0.4220597 0.15709661 0.01607289 0.17249499
##               OPC_10x
## GTEX-1192X 0.05645483
## GTEX-11DYG 0.08825899
rhos = cbind(rho_sm2[,-1], rho_10x[,-1])
rhos = as.data.frame(rhos)
dim(rhos)
## [1] 174  12
glist = list()
cols = brewer.pal(n = length(cell_types), name = "Dark2")

for(i in 1:length(cell_types)){
  cr_i = cor(rhos[[cell_types[i]]], 
              rhos[[cell_types_10x[i]]], 
              method="spearman")
  
  glist[[cell_types[i]]] = 
    ggplot(rhos, aes_string(x=cell_types[i], 
                            y=cell_types_10x[i])) + 
    geom_point(alpha=0.5, col=cols[i]) + 
    labs(title=sprintf("%s corr=%.3f", cell_types[i], cr_i), 
         x = "SMART-seq reference", 
         y = "10x reference")
}

ggarrange(plotlist=glist, nrow = 2, ncol = 3)

Session information

gc()
##            used  (Mb) gc trigger  (Mb) limit (Mb) max used  (Mb)
## Ncells  4614209 246.5    7367025 393.5         NA  7367025 393.5
## Vcells 20210080 154.2   63997930 488.3      32768 63997930 488.3
sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] GenomicRanges_1.46.1 GenomeInfoDb_1.30.0  RColorBrewer_1.1-2  
##  [4] ggpointdensity_0.1.0 viridis_0.6.2        viridisLite_0.4.0   
##  [7] ggpubr_0.4.0         ggplot2_3.4.0        ICeDT_0.99.1        
## [10] gtools_3.9.2         alabama_2022.4-1     numDeriv_2016.8-1.1 
## [13] org.Hs.eg.db_3.14.0  AnnotationDbi_1.56.2 IRanges_2.28.0      
## [16] S4Vectors_0.32.3     Biobase_2.54.0       BiocGenerics_0.40.0 
## [19] stringr_1.4.0        data.table_1.14.2   
## 
## loaded via a namespace (and not attached):
##  [1] httr_1.4.2             sass_0.4.0             tidyr_1.1.4           
##  [4] bit64_4.0.5            jsonlite_1.7.2         carData_3.0-4         
##  [7] bslib_0.3.1            assertthat_0.2.1       highr_0.9             
## [10] blob_1.2.2             GenomeInfoDbData_1.2.7 yaml_2.2.1            
## [13] pillar_1.6.4           RSQLite_2.2.8          backports_1.4.0       
## [16] glue_1.6.2             digest_0.6.29          XVector_0.34.0        
## [19] ggsignif_0.6.3         colorspace_2.0-2       plyr_1.8.6            
## [22] cowplot_1.1.1          htmltools_0.5.2        pkgconfig_2.0.3       
## [25] broom_0.7.10           zlibbioc_1.40.0        purrr_0.3.4           
## [28] scales_1.2.1           tibble_3.1.6           KEGGREST_1.34.0       
## [31] farver_2.1.0           generics_0.1.1         car_3.0-12            
## [34] ellipsis_0.3.2         cachem_1.0.6           withr_2.5.0           
## [37] cli_3.4.1              magrittr_2.0.1         crayon_1.4.2          
## [40] memoise_2.0.1          evaluate_0.14          fansi_0.5.0           
## [43] rstatix_0.7.0          tools_4.1.2            lifecycle_1.0.3       
## [46] munsell_0.5.0          Biostrings_2.62.0      compiler_4.1.2        
## [49] jquerylib_0.1.4        rlang_1.0.6            grid_4.1.2            
## [52] RCurl_1.98-1.5         rstudioapi_0.13        labeling_0.4.2        
## [55] bitops_1.0-7           rmarkdown_2.11         gtable_0.3.0          
## [58] abind_1.4-5            DBI_1.1.1              reshape2_1.4.4        
## [61] R6_2.5.1               gridExtra_2.3          knitr_1.36            
## [64] dplyr_1.0.7            fastmap_1.1.0          bit_4.0.4             
## [67] utf8_1.2.2             stringi_1.7.6          Rcpp_1.0.7            
## [70] vctrs_0.5.1            png_0.1-7              tidyselect_1.1.1      
## [73] xfun_0.28

References

Hodge, Rebecca D, Trygve E Bakken, Jeremy A Miller, Kimberly A Smith, Eliza R Barkan, Lucas T Graybuck, Jennie L Close, et al. 2019. “Conserved Cell Types with Divergent Features in Human Versus Mouse Cortex.” Nature 573 (7772): 61–68.